#' Original outlierKD function by Klodian Dhana,
#' https://www.r-bloggers.com/identify-describe-plot-and-remove-the-outliers-from-the-dataset/
#' Modified to take a third argument for removing outliers instead of an
#' interactive prompt; the original df is never changed. The function returns
#' a df, which can be saved over the original df name if desired.
#' Also added a QQ-plot to the output, with options to show/hide the boxplot,
#' histogram, and qqplot.
#' Check outliers, with the option to remove them (replaced by NA) and save
#' the result as a new dataframe.
#' @param df The dataframe.
#' @param var The (unquoted) variable in the dataframe to be checked for outliers.
#' @param rm Boolean. Whether to remove outliers or not.
#' @param boxplt Boolean. Whether to show the boxplot, before and after outliers removed.
#' @param histogram Boolean. Whether to show the histogram, before and after outliers removed.
#' @param qqplt Boolean. Whether to show the qqplot, before and after outliers removed.
#' @return The dataframe with outliers replaced by NA if rm == TRUE, otherwise df unchanged.
#' @examples
#' outlierKD2(mydf, height, FALSE, TRUE, TRUE, TRUE)
#' mydf = outlierKD2(mydf, height, TRUE, TRUE, TRUE, TRUE)
#' mydfnew = outlierKD2(mydf, height, TRUE)
#' @export
outlierKD2 <- function(df, var, rm = TRUE, boxplt = TRUE, histogram = TRUE, qqplt = TRUE) {
  dt <- df # duplicate the dataframe so the caller's df is never altered
  var_name <- eval(substitute(var), dt) # resolve the unquoted column name
  na1 <- sum(is.na(var_name)) # NA count before outlier removal
  m1 <- mean(var_name, na.rm = TRUE)
  colTotal <- boxplt + histogram + qqplt # number of chart columns requested
  if (colTotal > 0) {
    # max(2, colTotal) avoids a broken layout when only 0 or 1 chart type is
    # selected; par() state is restored on exit so later plots are unaffected.
    opar <- par(mfrow = c(2, max(2, colTotal)), oma = c(0, 0, 3, 0))
    on.exit(par(opar), add = TRUE)
  }
  if (qqplt) {
    qqnorm(var_name, main = "With outliers")
    qqline(var_name)
  }
  if (histogram) { hist(var_name, main = "With outliers", xlab = NA, ylab = NA) }
  if (boxplt) { boxplot(var_name, main = "With outliers") }
  # Outliers by the boxplot rule: points beyond 1.5 * IQR from the hinges.
  outlier <- boxplot.stats(var_name)$out
  mo <- mean(outlier)
  var_name <- ifelse(var_name %in% outlier, NA, var_name)
  if (qqplt) {
    qqnorm(var_name, main = "Without outliers")
    qqline(var_name)
  }
  if (histogram) { hist(var_name, main = "Without outliers", xlab = NA, ylab = NA) }
  if (boxplt) { boxplot(var_name, main = "Without outliers") }
  if (colTotal > 0) { # if no charts are wanted, skip this summary section
    title("Outlier Check", outer = TRUE)
    na2 <- sum(is.na(var_name)) # NA count after outlier removal
    cat("Outliers identified:", na2 - na1, "\n")
    # Fixed typo in the message: "Propotion" -> "Proportion".
    cat("Proportion (%) of outliers:", round((na2 - na1) / sum(!is.na(var_name)) * 100, 1), "\n")
    cat("Mean of the outliers:", round(mo, 2), "\n")
    m2 <- mean(var_name, na.rm = TRUE)
    cat("Mean without removing outliers:", round(m1, 2), "\n")
    cat("Mean if we remove outliers:", round(m2, 2), "\n")
  }
  if (rm) {
    # Write the NA-masked column back into the copy, keyed by the column name.
    dt[as.character(substitute(var))] <- var_name
    cat("Outliers successfully removed", "\n")
    return(invisible(dt))
  } else {
    cat("Nothing changed", "\n")
    return(invisible(df))
  }
}
There is considerable evidence indicating lending disparities throughout the United States (Steil et al., 2018). For our research topic, we will explore lending practices in one of the fastest-appreciating real estate markets over the past thirty years — the state of California. Specifically, we aim to look at the factors that are associated with denials for non-commercial mortgage loans.
Our SMART question is:
“Which factors drove denials for mortgages in California in 2019?”
To answer this question, we are using the Federal Financial Institutions Examination Council’s (FFIEC) Home Mortgage Disclosure Act (HMDA) dataset from 2019, located here: https://ffiec.cfpb.gov/data-publication/dynamic-national-loan-level-dataset/2019. We are focusing on a subset of data of 10,000 observations from 2019 that we will further filter on California leaving us with 5,196 observations.
Our GitHub repository address is: https://github.com/brandonchin19/Team3/.
# Load the raw HMDA extract and inspect structure, dimensions, and head/tail.
hmda_ca <- data.frame(read.csv("hmda_ca_new.csv"))
str(hmda_ca)
dim(hmda_ca)
xkabledplyhead(hmda_ca,5)
xkabledplytail(hmda_ca,5)
# Keep only non-commercial loans (business_or_commercial_purpose == 2).
# Fixed: this identical subset was originally applied twice in a row; the
# redundant second call (a harmless no-op) has been removed.
hmda_ca <- subset(hmda_ca,business_or_commercial_purpose=="2")
str(hmda_ca)
#rename(hmda_ca, ethnicity=derived_ethnicity, race=derived_race,
#sex=derived_sex)
# Keep only principal residences (occupancy_type == 1).
hmda_ca <- subset(hmda_ca,occupancy_type=="1")
dim(hmda_ca) #45735 99
#xkabledplyhead(hmda_ca,5)
#xkabledplytail(hmda_ca,5)
# Count the distinct counties represented in the filtered data.
loadPkg("sqldf")
names(hmda_ca)
sqldf("select count(distinct(county_code)) from hmda_ca")
unloadPkg("sqldf")
# Keep only originated (1) or denied (3) applications.
hmda_ca1<-hmda_ca%>%filter(action_taken %in% c("1", "3"))
hmda_ca<-hmda_ca1
dim(hmda_ca) #30661 99
# Keep the twelve analysis columns (demographics, loan terms, outcome).
hmda_ca_final <- hmda_ca[c(10,11,12,13,22,24,39,46,50,62,74,78)]
str(hmda_ca_final)
# Convert the analysis columns to appropriate types: factors for the
# categorical fields, numerics for the dollar/rate fields. as.numeric() on
# character columns coerces non-numeric entries to NA with a warning.
hmda_ca_final_1 = hmda_ca_final
hmda_ca_final_1$derived_ethnicity = factor(hmda_ca_final$derived_ethnicity)
hmda_ca_final_1$derived_race = factor(hmda_ca_final$derived_race)
hmda_ca_final_1$derived_sex = factor(hmda_ca_final$derived_sex)
#hmda_ca_final_1$action_taken = factor(hmda_ca_final$action_taken)
hmda_ca_final_1$loan_amount = as.numeric(hmda_ca_final$loan_amount)
hmda_ca_final_1$interest_rate = as.numeric(hmda_ca_final$interest_rate)
hmda_ca_final_1$property_value = as.numeric(hmda_ca_final$property_value)
hmda_ca_final_1$income = as.numeric(hmda_ca_final$income)
hmda_ca_final_1$applicant_age = factor(hmda_ca_final$applicant_age)
str(hmda_ca_final_1)
#Examine and filter out the missing values
missvalue <- is.na(hmda_ca_final_1)
summary(missvalue)
#There are 30,661 observations after we've filtered on all of the relevant fields.
#Missing values are present in interest_rate: 6,919; property_value: 1,734, and income: 1,302.
# NOTE(review): the chained assignment below imputes the column mean into
# hmda_ca_final_1$interest_rate, but then stores only the scalar mean in
# hmda_ca_final_2, which is immediately overwritten on the next line.
hmda_ca_final_2 <- hmda_ca_final_1$interest_rate[is.na(hmda_ca_final_1$interest_rate)] <- mean(hmda_ca_final_1$interest_rate, na.rm = TRUE)
# NOTE(review): this drop_na() result is discarded by the next line, and
# na.omit() on a data frame ignores a `cols` argument -- it drops every row
# containing any NA, not just NA income. Confirm this is the intended cleaning.
hmda_ca_final_2 <- hmda_ca_final_1 %>% drop_na(applicant_ethnicity.1)
hmda_ca_final_2 <- na.omit(hmda_ca_final_1, cols="income")
missvalue1 <- is.na(hmda_ca_final_2)
summary(missvalue1)
#After cleaning out the missing values, we are left with 28,695 observations
#Exploring the categorical values first
library(ggplot2)
# Bar charts of each categorical variable (Graphs 1-5): sex, age, ethnicity,
# action taken, and race.
ggplot(hmda_ca_final_2, aes(x = factor(derived_sex))) +
geom_bar(color="black", fill="antiquewhite2")+
labs(title="Graph 1. Applicant sex distribution", x="Applicant Sex", y="Count")
ggplot(hmda_ca_final_2, aes(x = factor(applicant_age))) +
geom_bar(color="black", fill="bisque3")+
labs(title="Graph 2. Applicant age distribution", x="Applicant Age", y="Count")
ggplot(hmda_ca_final_2, aes(x = factor(derived_ethnicity))) +
geom_bar(color="black", fill="cornsilk3")+
labs(title="Graph 3. Applicant ethnicity distribution", x="Applicant Ethnicity", y="Count")
ggplot(hmda_ca_final_2, aes(x = factor(action_taken))) +
geom_bar(color="black", fill="azure3")+
labs(title="Graph 4. Action taken distribution", x="Action Taken", y="Count")
# Race labels are long, so the x-axis text is rotated 90 degrees.
ggplot(hmda_ca_final_2, aes(x = factor(derived_race))) +
geom_bar(color="black", fill="azure3")+
labs(title="Graph 5. Derived Race", x="Race of the Applicant", y= "Count")+theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
Next, we proceed to exploring the numerical data for normality and outliers. The histograms (Graphs 6 and 8–10) indicate that none of the numerical variables are normally distributed.
# Histograms of the four numeric variables.
# NOTE(review): the graph numbering jumps from 6 to 8 -- there is no Graph 7.
ggplot(hmda_ca_final_2,aes(x=loan_amount))+
geom_histogram(color="black", fill="steelblue")+
labs(title=" Graph 6. Histogram of Loan Amount", x="Loan Amount in dollars", y="Frequency")+theme_bw()
ggplot(hmda_ca_final_2,aes(x=interest_rate))+
geom_histogram(color="black", fill="pink")+
labs(title="Graph 8. Histogram of Interest Rate", x="Interest Rates", y="Frequency")+theme_bw()
ggplot(hmda_ca_final_2,aes(x=income))+
geom_histogram(color="black", fill="azure3")+
labs(title="Graph 9. Histogram of Income", x="Income in dollars", y="Frequency")+theme_bw()
ggplot(hmda_ca_final_2,aes(x=property_value))+
geom_histogram(color="black", fill="lightblue")+
labs(title="Graph 10. Histogram of Property Values", x="Property Values in dollars", y="Frequency")+theme_bw()
As another check, we inspect the Q-Q plots and confirm that the distributions are not normal (Graphs 11–14).
# Q-Q plots of the numeric variables (Graphs 11-14) as a normality check.
qqnorm(hmda_ca_final_2$interest_rate,
  main="Graph 11. QQ Plot of Interest Rates",
  ylab="Interest Rate",
  col="pink")
qqline(hmda_ca_final_2$interest_rate)
# NOTE(review): Graph 13 (income) is plotted before Graph 12 (property values),
# so the figures appear out of numbered order.
qqnorm(hmda_ca_final_2$income,
  main="Graph 13. QQ Plot of Income",
  ylab="Income",
  col="green")
qqline(hmda_ca_final_2$income)
qqnorm(hmda_ca_final_2$property_value,
  main="Graph 12. QQ Plot of Property Values",
  ylab="Property Value",
  col="blue")
qqline(hmda_ca_final_2$property_value)
qqnorm(hmda_ca_final_2$loan_amount,
  main="Graph 14.QQ Plot of Loan Amount",
  ylab="Loan Amount",
  col="purple")
# Fixed: qqline previously used hmda_ca_final_1 while the qqnorm above uses
# hmda_ca_final_2, so the reference line did not match the plotted points.
qqline(hmda_ca_final_2$loan_amount)
Next, we remove outliers and check for normality again.
# Replace outliers with NA one numeric variable at a time; each outlierKD2
# call returns a new data frame and leaves its input unmodified.
hmda_ca_final_no_outliers_3 <- outlierKD2(hmda_ca_final_2,interest_rate)
hmda_ca_final_no_outliers_4 <- outlierKD2(hmda_ca_final_no_outliers_3,property_value)
hmda_ca_final_no_outliers_5 <- outlierKD2(hmda_ca_final_no_outliers_4,income)
loans <- outlierKD2(hmda_ca_final_no_outliers_5,loan_amount)
We then reexamine the variables for normality after removing the outliers.
# Re-check normality after outlier removal (Graphs 15-18).
qqnorm(loans$interest_rate,
main="Graph 15. QQ Plot of Interest Rates",
ylab="Interest Rate",
col="pink")
qqline(loans$interest_rate)
qqnorm(loans$property_value,
main="Graph 16. QQ Plot of Property Values",
ylab="Property Value",
col="blue")
qqline(loans$property_value)
qqnorm(loans$income,
main="Graph 17.QQ Plot of Income",
ylab="Income",
col="green")
qqline(loans$income)
qqnorm(loans$loan_amount,
main="Graph 18. QQ Plot of Loan Amount",
ylab="Loan Amount",
col="purple")
qqline(loans$loan_amount)
The values do not appear to be normally distributed even after removing the outliers.
# Five-number summaries of the numeric variables after outlier removal.
Numerical_var <- subset(loans,select=c(loan_amount, income, property_value, interest_rate))
#str(Numerical_var)
library(kableExtra)
summary_t<-kbl(summary(Numerical_var))%>%
kable_styling()
summary_t
# GROUP: what's with the NAs in the summary table after we have cleaned the NAs out?
# (Answer: outlierKD2 replaces outliers with NA rather than dropping rows,
# so new NAs are introduced after the earlier missing-value cleaning.)
However, the means and the medians are fairly close in all instances; do we keep removing the outliers?
#this chunk is for visual inspection only: EITHER DELETE or FORMAT before the final submission
boxplot(loans$interest_rate)
boxplot(loans$loan_amount)
boxplot(loans$property_value)
boxplot(loans$income)
library(lattice)
#dim(loans) #28695 12
pairs(loans)
#str(loans)
library(epiDisplay)
tab1(loans$derived_race, sort.group = "decreasing", cum.percent = TRUE)
##Preparing data for corrplot
# Fixed: `loans_all_num` was never defined anywhere in the script; the
# numeric-only subset Numerical_var (created earlier from loans) is used
# for the correlation matrix instead.
cor_loans<-cor(Numerical_var, use="complete.obs")
xkabledply(cor_loans)
loadPkg("corrplot")
corrplot(cor_loans)
# Contingency table of race vs. loan decision (1 = approved, 3 = denied).
contable1 = table(loans$derived_race, loans$action_taken)
xkabledply(contable1, title="Contingency table for Loan Approval and Race")
| 1 | 3 | |
|---|---|---|
| 2 or more minority races | 75 | 52 |
| American Indian or Alaska Native | 170 | 88 |
| Asian | 2672 | 824 |
| Black or African American | 1242 | 663 |
| Joint | 1062 | 287 |
| Native Hawaiian or Other Pacific Islander | 158 | 85 |
| Race Not Available | 3287 | 1008 |
| White | 14107 | 2915 |
# Chi-squared test of independence between race and loan decision.
# The ## lines below are pasted console output from a previous run.
chitest1 = chisq.test(contable1)
chitest1
##
## Pearson's Chi-squared test
##
## data: contable1
## X-squared = 492, df = 7, p-value <2e-16
# Contingency table of sex vs. loan decision (1 = approved, 3 = denied).
contable2 = table(loans$derived_sex, loans$action_taken)
xkabledply(contable2, title="Contingency table for Loan Approval and Sex")
| 1 | 3 | |
|---|---|---|
| Female | 4639 | 1346 |
| Joint | 10354 | 2065 |
| Male | 6771 | 2161 |
| Sex Not Available | 1009 | 350 |
# Chi-squared test of independence between sex and loan decision.
# The ## lines below are pasted console output from a previous run.
chitest2 = chisq.test(contable2)
chitest2
##
## Pearson's Chi-squared test
##
## data: contable2
## X-squared = 225, df = 3, p-value <2e-16
# Contingency table of ethnicity vs. loan decision (1 = approved, 3 = denied).
contable3 = table(loans$derived_ethnicity, loans$action_taken)
xkabledply(contable3, title="Contingency table for Loan Approval and Ethnicity")
| 1 | 3 | |
|---|---|---|
| Ethnicity Not Available | 2891 | 830 |
| Hispanic or Latino | 4145 | 1291 |
| Joint | 1229 | 285 |
| Not Hispanic or Latino | 14508 | 3516 |
# Chi-squared test of independence between ethnicity and loan decision.
# The ## lines below are pasted console output from a previous run.
chitest3 = chisq.test(contable3)
chitest3
##
## Pearson's Chi-squared test
##
## data: contable3
## X-squared = 56, df = 3, p-value = 5e-12
# Contingency table of age group vs. loan decision (1 = approved, 3 = denied).
# NOTE(review): an "8888" age level appears in the output -- presumably a
# not-applicable code in HMDA; verify whether it should be filtered out.
contable4 = table(loans$applicant_age, loans$action_taken)
xkabledply(contable4, title="Contingency table for Loan Approval and Age")
| 1 | 3 | |
|---|---|---|
| <25 | 253 | 58 |
| >74 | 1621 | 373 |
| 25-34 | 3897 | 823 |
| 35-44 | 5606 | 1417 |
| 45-54 | 4924 | 1382 |
| 55-64 | 3674 | 1089 |
| 65-74 | 2798 | 779 |
| 8888 | 0 | 1 |
# Chi-squared test of independence between age group and loan decision.
# The ## lines below are pasted console output from a previous run.
chitest4 = chisq.test(contable4)
chitest4
##
## Pearson's Chi-squared test
##
## data: contable4
## X-squared = 63, df = 7, p-value = 4e-11
#adding New Code for final and downsampling
# Recode the outcome into a labelled factor: action_taken "1" -> Approved,
# everything else -> Denied.
hmda <- data.frame(loans)
str(hmda$action_taken)
hmda$approval<- ifelse(hmda$action_taken=="1", "Approved","Denied")
hmda$approval<- factor(hmda$approval)
str(hmda)
table(hmda$approval)
#Approved Denied
#22773 5922
Checking frequency of approval variable and derived_race variable
# Frequency tables for applicant race and approval status.
# (Named arguments keep the printed dimension headers identical to the
# original with()-block version.)
print(table(derived_race = hmda$derived_race))
print(table(approval = hmda$approval))
Creating a downsample of the hmda dataset: the Approved far outnumber the Denied, making this dataset severely unbalanced (Approved: 22773, Denied: 5922).
# Row indices of approved and denied applications.
Approved<- which(hmda$action_taken=="1")
Denied<- which(hmda$action_taken=="3")
length(Approved)
length(Denied)
# Randomly downsample the approved rows to match the denied count.
# NOTE(review): no set.seed() before sample(), so the downsample is not
# reproducible across runs.
approved_downsample<-sample(Approved,length(Denied))
hmda_down<- hmda[c(approved_downsample, Denied),]
str(hmda_down)
# NOTE(review): these positional indices assume a wider data frame than the
# 13 columns (12 analysis columns + approval) visible in this script -- verify
# against the actual column layout.
hmda_downnum<-hmda_down[c(4:11, 13:17, 19)]
# One-hot flags for the two outcomes.
hmda_downnum$approved<- ifelse(hmda_downnum$action_taken=="1", 1, 0)
hmda_downnum$denied<- ifelse(hmda_downnum$action_taken=="3", 1, 0)
str(hmda_downnum)
# Correlation matrix on the downsampled columns (overwrites the earlier
# cor_loans object).
# NOTE(review): cor() requires all-numeric input -- confirm hmda_downnum
# contains no factor/character columns at this point.
cor_loans<-cor(hmda_downnum, use="complete.obs")
xkabledply(cor_loans)
loadPkg("corrplot")
corrplot(cor_loans)
Conducting Chi-squared Test on the downsampled dataset (hmda_down)
The p-value for contable1 is lower than .05; therefore we reject the null hypothesis — action_taken and derived_race are not independent of each other.
# Re-run the chi-squared independence tests on the balanced (downsampled)
# dataset; these reuse and overwrite the contable*/chitest* names from the
# full-data tests above.
contable1= table(hmda_down$derived_race, hmda_down$action_taken)
xkabledply(contable1, title="Contingency table for Loan Approval and Race")
chitest1 = chisq.test(contable1)
chitest1
contable2= table(hmda_down$derived_ethnicity, hmda_down$action_taken)
xkabledply(contable2, title="Contingency table for Loan Approval and Ethnicity")
chitest2 = chisq.test(contable2)
chitest2
contable3= table(hmda_down$derived_sex, hmda_down$action_taken)
xkabledply(contable3, title="Contingency table for Loan Approval and Gender")
chitest3 = chisq.test(contable3)
chitest3
contable4= table(hmda_down$applicant_age, hmda_down$action_taken)
xkabledply(contable4, title="Contingency table for Loan Approval and Age")
chitest4 = chisq.test(contable4)
chitest4
# Kitchen-sink linear model of action_taken on every remaining column.
# NOTE(review): action_taken is numeric (1/3) here, so this is an ordinary
# least-squares fit, not a classification model.
full_model <- lm(action_taken~., data = loans)
summary(full_model)
# Restrict to the twelve analysis columns and refit.
loans_for_modelling <- loans[1:12]
full_model_2 <- lm(action_taken~., data = loans_for_modelling)
summary(full_model_2)
#{r} #pcr.fit=pcr(action_taken~.,data=loans_for_modelling,scale=FALSE,validation ="CV") #ezids::xkabledplyhead(loans_for_modelling) #summary(pcr.fit) #
str(loans_for_modelling)
# Successive linear models regressing action_taken on the demographic factors,
# adding one predictor group at a time; VIFs check for multicollinearity.
# Fixed: lm() has no `family` argument -- the `family = "binomial"` passed
# previously was disregarded (with a warning), so these were always plain
# OLS fits. The misleading argument has been removed; an actual logistic
# regression requires glm(..., family = "binomial"), as used on the
# downsampled data later in this script.
action_taken_fit1 <- lm(action_taken ~ derived_race, data = loans_for_modelling)
summary(action_taken_fit1)
xkablevif(action_taken_fit1)
action_taken_fit2 <- lm(action_taken ~ derived_race + derived_ethnicity, data = loans_for_modelling)
summary(action_taken_fit2)
action_taken_fit3 <- lm(action_taken ~ derived_race + derived_ethnicity + derived_sex, data = loans_for_modelling)
summary(action_taken_fit3)
xkablevif(action_taken_fit3)
action_taken_fit4 <- lm(action_taken ~ derived_race + derived_ethnicity + derived_sex + applicant_age, data = loans_for_modelling)
summary(action_taken_fit4)
xkablevif(action_taken_fit4)
The multiple R^2s increase with each successive model, indicating slight improvements from action_taken_fit1 through action_taken_fit4.
# Nested-model comparison of the four successive fits.
anova(action_taken_fit1,action_taken_fit2,action_taken_fit3,action_taken_fit4) -> anovaRes
str(anovaRes)
xkabledply(anovaRes, title = "ANOVA comparison between the models")
The residuals decrease with each successive linear model. This suggests that action_taken_fit4 is a better regression model for explaining this data.
# Re-encode the outcome as a factor for logistic regression.
# Fixed: removed the two no-op self-assignments (`x = x`) that preceded
# these conversions.
loans_for_modelling$action_taken <- factor(loans_for_modelling$action_taken)
hmda_down$action_taken <- factor(hmda_down$action_taken)
# Logistic regression of the loan decision on demographics, loan amount and
# income, fitted on the balanced (downsampled) data.
approval_logit <- glm(action_taken ~ derived_race + derived_ethnicity + derived_sex + loan_amount + income, data = hmda_down, family = "binomial")
summary(approval_logit)
xkablevif(approval_logit)
# Race-only logistic model for comparison.
approval_logit_1 <- glm(action_taken ~ derived_race, data = hmda_down, family = "binomial")
summary(approval_logit_1)
xkablevif(approval_logit_1)
#define new observations
newdata1 = data.frame(derived_race = 'Black or African American', derived_sex = 'Female', loan_amount = 100000, derived_ethnicity = 'Not Hispanic or Latino', income = 45000)
newdata2 = data.frame(derived_race = 'Black or African American')
newdata3 = data.frame(derived_race = 'White')
# Predicted probabilities (type="response") for the hypothetical applicants.
# NOTE(review): the predictions overwrite the newdata* input data frames;
# use distinct names (e.g. pred1) if the inputs are needed again.
newdata1 = predict(approval_logit, newdata1, type="response")
newdata2 = predict(approval_logit_1, newdata2, type="response")
newdata3 = predict(approval_logit_1, newdata3, type="response")
newdata1
newdata2
newdata3
summary(hmda_down$action_taken)
#SideNotes on ANOVA Testing. These are some constructs of my ANOVA test; I left them here just in case I need to revisit them. They might not be perfect because I ran multiple different variants. I assume this will not be used ultimately, and if we need to, we can run chi-squared on the balanced dataset.
Conducting ANOVA Test on action_taken variable and income, saved test as anova
# Exploratory ANOVA of income by approval status (side notes, not part of the
# main analysis).
# Fixed: multiple statements were mashed onto single lines and the ggplot call
# used curly ("smart") quotes -- neither parses in R. Reformatted with one
# statement per line and straight quotes; runtime strings are unchanged.
# NOTE(review): the name `anova` shadows stats::anova (used earlier for the
# model comparison); rename if this section is ever promoted to real code.
anova= aov(income ~ approval, data=hmda)
str(anova)
#Plotting ANOVA TEST
loadPkg("ggplot2")
ggplot(hmda, aes(x=approval, y=income)) +
  geom_boxplot(colour=c("#ff0000", "#11cc11")) +
  labs(title="Income difference between Approved and Denied Apps",
       x="Approval Status", y = "Income")
# Remaining scratch fragments kept as comments (they reference a `Black`
# column that does not exist in the frames built here -- verify):
#plot(income ~ action_taken, data=hmda)
#anova= aov(action_taken ~ Black, data=hmda)
#names(anova); xkabledply(anova)